From f91cf5979becb9058e1b94f39f78039b80fb9244 Mon Sep 17 00:00:00 2001 From: "cl349@freefall.cl.cam.ac.uk" Date: Fri, 30 Jul 2004 18:48:57 +0000 Subject: [PATCH] bitkeeper revision 1.1108.44.1 (410a9819I8nRg4cqB44agt4G5KysGA) add network backend driver for 2.6 --- .rootkeys | 12 +- linux-2.4.26-xen-sparse/mkbuildtree | 2 +- .../arch/xen/i386/mm/hypervisor.c | 97 +- linux-2.6.7-xen-sparse/drivers/xen/Makefile | 4 +- .../drivers/xen/net/Makefile | 2 - .../drivers/xen/netback/Makefile | 2 + .../drivers/xen/netback/common.h | 97 + .../drivers/xen/netback/control.c | 65 + .../drivers/xen/netback/interface.c | 288 +++ .../drivers/xen/netback/netback.c | 778 +++++++ .../drivers/xen/{net => netfront}/Kconfig | 0 .../drivers/xen/netfront/Makefile | 2 + .../{net/network.c => netfront/netfront.c} | 0 .../include/asm-xen/asm-i386/hypervisor.h | 15 +- linux-2.6.7-xen-sparse/mm/page_alloc.c | 2017 +++++++++++++++++ 15 files changed, 3369 insertions(+), 12 deletions(-) delete mode 100644 linux-2.6.7-xen-sparse/drivers/xen/net/Makefile create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/common.h create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/control.c create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c rename linux-2.6.7-xen-sparse/drivers/xen/{net => netfront}/Kconfig (100%) create mode 100644 linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile rename linux-2.6.7-xen-sparse/drivers/xen/{net/network.c => netfront/netfront.c} (100%) create mode 100644 linux-2.6.7-xen-sparse/mm/page_alloc.c diff --git a/.rootkeys b/.rootkeys index 12babc83d8..b35246e7c7 100644 --- a/.rootkeys +++ b/.rootkeys @@ -203,9 +203,14 @@ 3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.7-xen-sparse/drivers/xen/console/console.c 40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.7-xen-sparse/drivers/xen/evtchn/Makefile 40f56239DoibTX6R-ZYd3QTXAB8_TA linux-2.6.7-xen-sparse/drivers/xen/evtchn/evtchn.c -40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig -40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/net/Makefile -405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/net/network.c +410a9817HEVJvred5Oy_uKH3HFJC5Q linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile +4097ba831lpGeLlPg-bfV8XarVVuoQ linux-2.6.7-xen-sparse/drivers/xen/netback/common.h +4097ba83wvv8yi5P5xugCUBAdb6O-A linux-2.6.7-xen-sparse/drivers/xen/netback/control.c +4097ba83byY5bTSugJGZ1exTxIcMKw linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c +4087cf0dGmSbFhFZyIZBJzvqxY-qBw linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c +40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig +40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile +405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c 4108f5c1ppFXVpQzCOAZ6xXYubsjKA linux-2.6.7-xen-sparse/drivers/xen/privcmd/Makefile 3e5a4e65IUfzzMu2kZFlGEB8-rpTaA linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c 40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/desc.h @@ -251,6 +256,7 @@ 40f5623cBiQhPHILVLrl3xa6bDBaRg linux-2.6.7-xen-sparse/include/asm-xen/xen.h 3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h 40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree +410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c 40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile 401d7e160vaxMBAUSLSicuZ7AQjJ3w tools/examples/Makefile diff --git a/linux-2.4.26-xen-sparse/mkbuildtree b/linux-2.4.26-xen-sparse/mkbuildtree index a2f3a2cd80..46b992a988 100755 --- a/linux-2.4.26-xen-sparse/mkbuildtree +++ b/linux-2.4.26-xen-sparse/mkbuildtree @@ -238,4 +238,4 @@ cd ${AD}/arch/xen/drivers/dom0 ln -sf ../../../../${LINUX_26}/drivers/xen/privcmd/privcmd.c core.c cd ${AD}/arch/xen/drivers/netif/frontend -ln -sf ../../../../../${LINUX_26}/drivers/xen/net/network.c main.c +ln -sf ../../../../../${LINUX_26}/drivers/xen/netfront/netfront.c main.c diff --git a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c index 35e8c3e312..173a6a0891 100644 --- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c +++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -126,8 +128,11 @@ static inline void __flush_page_update_queue(void) #endif idx = 0; wmb(); /* Make sure index is cleared first to avoid double updates. */ - if (unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0)) - panic("Failed to execute MMU updates"); + if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0) ) + { + printk(KERN_ALERT "Failed to execute MMU updates.\n"); + BUG(); + } } void _flush_page_update_queue(void) @@ -262,3 +267,91 @@ void queue_machphys_update(unsigned long mfn, unsigned long pfn) increment_index(); spin_unlock_irqrestore(&update_lock, flags); } + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + +unsigned long allocate_empty_lowmem_region(unsigned long pages) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long *pfn_array; + unsigned long vstart; + unsigned long i; + int ret; + unsigned int order = get_order(pages*PAGE_SIZE); + + vstart = __get_free_pages(GFP_KERNEL, order); + if ( vstart == 0 ) + return 0UL; + + pfn_array = vmalloc((1<pte_low >> PAGE_SHIFT; + queue_l1_entry_update(pte, 0); + phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef; + } + + flush_page_update_queue(); + + ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, + pfn_array, 1<>PAGE_SHIFT); + phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i]; + } + + flush_page_update_queue(); + + vfree(pfn_array); + + free_pages(vstart, order); +} + +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ diff --git a/linux-2.6.7-xen-sparse/drivers/xen/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/Makefile index 2b3219fd4e..ae06ee79a7 100644 --- a/linux-2.6.7-xen-sparse/drivers/xen/Makefile +++ b/linux-2.6.7-xen-sparse/drivers/xen/Makefile @@ -3,5 +3,7 @@ obj-y += block/ obj-y += console/ obj-y += evtchn/ -obj-y += net/ +obj-y += netfront/ obj-y += privcmd/ + +obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += netback/ diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile deleted file mode 100644 index 694dd834f0..0000000000 --- a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile +++ /dev/null @@ -1,2 +0,0 @@ - -obj-y := network.o diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile new file mode 100644 index 0000000000..3279442145 --- /dev/null +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile @@ -0,0 +1,2 @@ + +obj-y := netback.o control.o interface.o diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h new file mode 100644 index 0000000000..b977dfff39 --- /dev/null +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h @@ -0,0 +1,97 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/common.h + */ + +#ifndef __NETIF__BACKEND__COMMON_H__ +#define __NETIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct netif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + + /* Physical parameters of the comms window. */ + unsigned long tx_shmem_frame; + unsigned long rx_shmem_frame; + unsigned int evtchn; + int irq; + + /* The shared rings and indexes. */ + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + /* Private indexes into shared ring. */ + NETIF_RING_IDX rx_req_cons; + NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ + NETIF_RING_IDX tx_req_cons; + NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ + + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ + unsigned long credit_bytes; + unsigned long credit_usec; + unsigned long remaining_credit; + struct timer_list credit_timeout; + + /* Miscellaneous private stuff. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. + */ + u8 disconnect_rspid; + struct netif_st *hash_next; + struct list_head list; /* scheduling list */ + atomic_t refcnt; + spinlock_t rx_lock, tx_lock; + struct net_device *dev; + struct net_device_stats stats; +} netif_t; + +void netif_create(netif_be_create_t *create); +void netif_destroy(netif_be_destroy_t *destroy); +void netif_connect(netif_be_connect_t *connect); +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id); +void __netif_disconnect_complete(netif_t *netif); +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle); +#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define netif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __netif_disconnect_complete(_b); \ + } while (0) + +void netif_interface_init(void); +void netif_ctrlif_init(void); + +void netif_deschedule(netif_t *netif); + +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); +struct net_device_stats *netif_be_get_stats(struct net_device *dev); +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __NETIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c new file mode 100644 index 0000000000..cf1b075031 --- /dev/null +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c @@ -0,0 +1,65 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_NETIF_BE_CREATE: + if ( msg->length != sizeof(netif_be_create_t) ) + goto parse_error; + netif_create((netif_be_create_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DESTROY: + if ( msg->length != sizeof(netif_be_destroy_t) ) + goto parse_error; + netif_destroy((netif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_CONNECT: + if ( msg->length != sizeof(netif_be_connect_t) ) + goto parse_error; + netif_connect((netif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DISCONNECT: + if ( msg->length != sizeof(netif_be_disconnect_t) ) + goto parse_error; + if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. */ + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void netif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + netif_be_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(netif_be_driver_status_changed_t); + st.status = NETIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c new file mode 100644 index 0000000000..01447b1de8 --- /dev/null +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c @@ -0,0 +1,288 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/interface.c + * + * Network-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define VMALLOC_VMADDR(x) ((unsigned long)(x)) +#endif + +#define NETIF_HASHSZ 1024 +#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1)) + +static netif_t *netif_hash[NETIF_HASHSZ]; + +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) +{ + netif_t *netif = netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif != NULL) && + ((netif->domid != domid) || (netif->handle != handle)) ) + netif = netif->hash_next; + return netif; +} + +void __netif_disconnect_complete(netif_t *netif) +{ + ctrl_msg_t cmsg; + netif_be_disconnect_t disc; + + /* + * These can't be done in __netif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(netif->evtchn); + vfree(netif->tx); /* Frees netif->rx as well. */ + rtnl_lock(); + (void)dev_close(netif->dev); + rtnl_unlock(); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DISCONNECT; + cmsg.id = netif->disconnect_rspid; + cmsg.length = sizeof(netif_be_disconnect_t); + disc.domid = netif->domid; + disc.netif_handle = netif->handle; + disc.status = NETIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'netif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( netif->status != DISCONNECTING ) + BUG(); + netif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void netif_create(netif_be_create_t *create) +{ + int err = 0; + domid_t domid = create->domid; + unsigned int handle = create->netif_handle; + struct net_device *dev; + netif_t **pnetif, *netif; + char name[IFNAMSIZ] = {}; + + snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); + dev = alloc_netdev(sizeof(netif_t), name, ether_setup); + if ( dev == NULL ) + { + DPRINTK("Could not create netif: out of memory\n"); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + netif = dev->priv; + memset(netif, 0, sizeof(*netif)); + netif->domid = domid; + netif->handle = handle; + netif->status = DISCONNECTED; + spin_lock_init(&netif->rx_lock); + spin_lock_init(&netif->tx_lock); + atomic_set(&netif->refcnt, 0); + netif->dev = dev; + + netif->credit_bytes = netif->remaining_credit = ~0UL; + netif->credit_usec = 0UL; + /*init_ac_timer(&new_vif->credit_timeout);*/ + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( *pnetif != NULL ) + { + if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) ) + { + DPRINTK("Could not create netif: already exists\n"); + create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; + kfree(dev); + return; + } + pnetif = &(*pnetif)->hash_next; + } + + dev->hard_start_xmit = netif_be_start_xmit; + dev->get_stats = netif_be_get_stats; + memcpy(dev->dev_addr, create->mac, ETH_ALEN); + + /* Disable queuing. */ + dev->tx_queue_len = 0; + + /* Force a different MAC from remote end. */ + dev->dev_addr[2] ^= 1; + + if ( (err = register_netdev(dev)) != 0 ) + { + DPRINTK("Could not register new net device %s: err=%d\n", + dev->name, err); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + kfree(dev); + return; + } + + netif->hash_next = *pnetif; + *pnetif = netif; + + DPRINTK("Successfully created netif\n"); + create->status = NETIF_BE_STATUS_OKAY; +} + +void netif_destroy(netif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->netif_handle; + netif_t **pnetif, *netif; + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif = *pnetif) != NULL ) + { + if ( (netif->domid == domid) && (netif->handle == handle) ) + { + if ( netif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pnetif = &netif->hash_next; + } + + destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pnetif = netif->hash_next; + unregister_netdev(netif->dev); + kfree(netif->dev); + destroy->status = NETIF_BE_STATUS_OKAY; +} + +void netif_connect(netif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->netif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long tx_shmem_frame = connect->tx_shmem_frame; + unsigned long rx_shmem_frame = connect->rx_shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + netif_t *netif; +#if 0 + struct net_device *eth0_dev; +#endif + + netif = netif_find_by_handle(domid, handle); + if ( unlikely(netif == NULL) ) + { + DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", + connect->domid, connect->netif_handle); + connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( netif->status != DISCONNECTED ) + { + connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + } + + if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, + VMALLOC_VMADDR(vma->addr), + tx_shmem_frame<addr) + PAGE_SIZE, + rx_shmem_frame<status = NETIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = NETIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = NETIF_BE_STATUS_ERROR; + vfree(vma->addr); + return; + } + + netif->evtchn = evtchn; + netif->irq = bind_evtchn_to_irq(evtchn); + netif->tx_shmem_frame = tx_shmem_frame; + netif->rx_shmem_frame = rx_shmem_frame; + netif->tx = + (netif_tx_interface_t *)vma->addr; + netif->rx = + (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); + netif->status = CONNECTED; + netif_get(netif); + + rtnl_lock(); + (void)dev_open(netif->dev); + rtnl_unlock(); + + (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif); + netif_start_queue(netif->dev); + + connect->status = NETIF_BE_STATUS_OKAY; +} + +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->netif_handle; + netif_t *netif; + + netif = netif_find_by_handle(domid, handle); + if ( unlikely(netif == NULL) ) + { + DPRINTK("netif_disconnect attempted for non-existent netif" + " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); + disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send response error message. */ + } + + if ( netif->status == CONNECTED ) + { + netif->status = DISCONNECTING; + netif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + netif_stop_queue(netif->dev); + free_irq(netif->irq, netif); + netif_deschedule(netif); + netif_put(netif); + return 0; /* Caller should not send response message. */ + } + + disconnect->status = NETIF_BE_STATUS_OKAY; + return 1; +} + +void netif_interface_init(void) +{ + memset(netif_hash, 0, sizeof(netif_hash)); +} diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c new file mode 100644 index 0000000000..c0381048d8 --- /dev/null +++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c @@ -0,0 +1,778 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/netif/frontend + * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include "common.h" + +static void netif_page_release(struct page *page); +static void make_tx_response(netif_t *netif, + u16 id, + s8 st); +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size); + +static void net_tx_action(unsigned long unused); +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); + +static void net_rx_action(unsigned long unused); +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); + +static struct sk_buff_head rx_queue; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3]; +static unsigned char rx_notify[NR_EVENT_CHANNELS]; + +/* Don't currently gate addition of an interface to the tx scheduling list. */ +#define tx_work_exists(_if) (1) + +#define MAX_PENDING_REQS 256 +static unsigned long mmap_vstart; +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) + +#define PKT_PROT_LEN (ETH_HLEN + 20) + +static struct { + netif_tx_request_t req; + netif_t *netif; +} pending_tx_info[MAX_PENDING_REQS]; +static u16 pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +/* Freed TX SKBs get batched on this ring before return to pending_ring. */ +static u16 dealloc_ring[MAX_PENDING_REQS]; +static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +static struct sk_buff_head tx_queue; +static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; + +static struct list_head net_schedule_list; +static spinlock_t net_schedule_list_lock; + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; + +static void __refresh_mfn_list(void) +{ + int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation, + mfn_list, MAX_MFN_ALLOC); + if ( unlikely(ret != MAX_MFN_ALLOC) ) + BUG(); + alloc_index = MAX_MFN_ALLOC; +} + +static unsigned long get_new_mfn(void) +{ + unsigned long mfn, flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index == 0 ) + __refresh_mfn_list(); + mfn = mfn_list[--alloc_index]; + spin_unlock_irqrestore(&mfn_lock, flags); + return mfn; +} + +static void dealloc_mfn(unsigned long mfn) +{ + unsigned long flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index != MAX_MFN_ALLOC ) + mfn_list[alloc_index++] = mfn; + else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1) != 1 ) + BUG(); + spin_unlock_irqrestore(&mfn_lock, flags); +} + +static inline void maybe_schedule_tx_action(void) +{ + smp_mb(); + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list) ) + tasklet_schedule(&net_tx_tasklet); +} + +/* + * This is the primary RECEIVE function for a network interface. + * Note that, from the p.o.v. of /this/ OS it looks like a transmit. + */ +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + netif_t *netif = (netif_t *)dev->priv; + + ASSERT(skb->dev == dev); + + /* Drop the packet if the target domain has no receive buffers. */ + if ( (netif->rx_req_cons == netif->rx->req_prod) || + ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) ) + goto drop; + + /* + * We do not copy the packet unless: + * 1. The data is shared; or + * 2. It spans a page boundary; or + * 3. We cannot be sure the whole data page is allocated. + * The copying method is taken from skb_copy(). + * NB. We also couldn't cope with fragmented packets, but we won't get + * any because we not advertise the NETIF_F_SG feature. + */ + if ( skb_shared(skb) || skb_cloned(skb) || + (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) || + ((skb->end - skb->head) < (PAGE_SIZE/2)) ) + { + struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC); + int hlen = skb->data - skb->head; + if ( unlikely(nskb == NULL) ) + goto drop; + skb_reserve(nskb, hlen); + __skb_put(nskb, skb->len); + (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len); + nskb->dev = skb->dev; + dev_kfree_skb(skb); + skb = nskb; + } + + netif->rx_req_cons++; + + skb_queue_tail(&rx_queue, skb); + tasklet_schedule(&net_rx_tasklet); + + return 0; + + drop: + netif->stats.rx_dropped++; + dev_kfree_skb(skb); + return 0; +} + +#if 0 +static void xen_network_done_notify(void) +{ + static struct net_device *eth0_dev = NULL; + if ( unlikely(eth0_dev == NULL) ) + eth0_dev = __dev_get_by_name("eth0"); + netif_rx_schedule(eth0_dev); +} +/* + * Add following to poll() function in NAPI driver (Tigon3 is example): + * if ( xen_network_done() ) + * tg3_enable_ints(tp); + */ +int xen_network_done(void) +{ + return skb_queue_empty(&rx_queue); +} +#endif + +static void net_rx_action(unsigned long unused) +{ + netif_t *netif; + s8 status; + u16 size, id, evtchn; + mmu_update_t *mmu; + multicall_entry_t *mcl; + unsigned long vdata, mdata, new_mfn; + struct sk_buff_head rxq; + struct sk_buff *skb; + u16 notify_list[NETIF_RX_RING_SIZE]; + int notify_nr = 0; + + skb_queue_head_init(&rxq); + + mcl = rx_mcl; + mmu = rx_mmu; + while ( (skb = skb_dequeue(&rx_queue)) != NULL ) + { + netif = (netif_t *)skb->dev->priv; + vdata = (unsigned long)skb->data; + mdata = virt_to_machine(vdata); + new_mfn = get_new_mfn(); + + mmu[0].ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu[0].val = __pa(vdata) >> PAGE_SHIFT; + mmu[1].val = (unsigned long)(netif->domid<<16) & ~0xFFFFUL; + mmu[1].ptr = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL; + mmu[1].ptr |= MMU_EXTENDED_COMMAND; + mmu[1].val |= MMUEXT_SET_SUBJECTDOM; + mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; + mmu[2].val = MMUEXT_REASSIGN_PAGE; + + mcl[0].op = __HYPERVISOR_update_va_mapping; + mcl[0].args[0] = vdata >> PAGE_SHIFT; + mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL; + mcl[0].args[2] = 0; + mcl[1].op = __HYPERVISOR_mmu_update; + mcl[1].args[0] = (unsigned long)mmu; + mcl[1].args[1] = 3; + mcl[1].args[2] = 0; + + mcl += 2; + mmu += 3; + + __skb_queue_tail(&rxq, skb); + + /* Filled the batch queue? */ + if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) + break; + } + + if ( mcl == rx_mcl ) + return; + + mcl[-2].args[2] = UVMF_FLUSH_TLB; + if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) ) + BUG(); + + mcl = rx_mcl; + mmu = rx_mmu; + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + netif = (netif_t *)skb->dev->priv; + size = skb->tail - skb->data; + + /* Rederive the machine addresses. */ + new_mfn = mcl[0].args[1] >> PAGE_SHIFT; + mdata = ((mmu[2].ptr & PAGE_MASK) | + ((unsigned long)skb->data & ~PAGE_MASK)); + + phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + netif->stats.rx_bytes += size; + netif->stats.rx_packets++; + + /* The update_va_mapping() must not fail. */ + if ( unlikely(mcl[0].args[5] != 0) ) + BUG(); + + /* Check the reassignment error code. */ + status = NETIF_RSP_OKAY; + if ( unlikely(mcl[1].args[5] != 0) ) + { + DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid); + dealloc_mfn(mdata >> PAGE_SHIFT); + status = NETIF_RSP_ERROR; + } + + evtchn = netif->evtchn; + id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; + if ( make_rx_response(netif, id, status, mdata, size) && + (rx_notify[evtchn] == 0) ) + { + rx_notify[evtchn] = 1; + notify_list[notify_nr++] = evtchn; + } + + dev_kfree_skb(skb); + + mcl += 2; + mmu += 3; + } + + while ( notify_nr != 0 ) + { + evtchn = notify_list[--notify_nr]; + rx_notify[evtchn] = 0; + notify_via_evtchn(evtchn); + } + + /* More work to do? */ + if ( !skb_queue_empty(&rx_queue) ) + tasklet_schedule(&net_rx_tasklet); +#if 0 + else + xen_network_done_notify(); +#endif +} + +struct net_device_stats *netif_be_get_stats(struct net_device *dev) +{ + netif_t *netif = dev->priv; + return &netif->stats; +} + +static int __on_net_schedule_list(netif_t *netif) +{ + return netif->list.next != NULL; +} + +static void remove_from_net_schedule_list(netif_t *netif) +{ + spin_lock_irq(&net_schedule_list_lock); + if ( likely(__on_net_schedule_list(netif)) ) + { + list_del(&netif->list); + netif->list.next = NULL; + netif_put(netif); + } + spin_unlock_irq(&net_schedule_list_lock); +} + +static void add_to_net_schedule_list_tail(netif_t *netif) +{ + if ( __on_net_schedule_list(netif) ) + return; + + spin_lock_irq(&net_schedule_list_lock); + if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) ) + { + list_add_tail(&netif->list, &net_schedule_list); + netif_get(netif); + } + spin_unlock_irq(&net_schedule_list_lock); +} + +static inline void netif_schedule_work(netif_t *netif) +{ + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +void netif_deschedule(netif_t *netif) +{ + remove_from_net_schedule_list(netif); +} + +#if 0 +static void tx_credit_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + netif->remaining_credit = netif->credit_bytes; + netif_schedule_work(netif); +} +#endif + +static void net_tx_action(unsigned long unused) +{ + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + u16 pending_idx; + NETIF_RING_IDX i; + struct page *page; + multicall_entry_t *mcl; + PEND_RING_IDX dc, dp; + + if ( (dc = dealloc_cons) == (dp = dealloc_prod) ) + goto skip_dealloc; + + mcl = tx_mcl; + while ( dc != dp ) + { + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + mcl[0].op = __HYPERVISOR_update_va_mapping; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[1] = 0; + mcl[0].args[2] = 0; + mcl++; + } + + mcl[-1].args[2] = UVMF_FLUSH_TLB; + if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) + BUG(); + + mcl = tx_mcl; + while ( dealloc_cons != dp ) + { + /* The update_va_mapping() must not fail. */ + if ( unlikely(mcl[0].args[5] != 0) ) + BUG(); + + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_tx_info[pending_idx].netif; + + spin_lock(&netif->tx_lock); + make_tx_response(netif, pending_tx_info[pending_idx].req.id, + NETIF_RSP_OKAY); + spin_unlock(&netif->tx_lock); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + /* + * Scheduling checks must happen after the above response is posted. + * This avoids a possible race with a guest OS on another CPU. + */ + mb(); + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + add_to_net_schedule_list_tail(netif); + + netif_put(netif); + + mcl++; + } + + skip_dealloc: + mcl = tx_mcl; + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list) ) + { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + /* Work to do? */ + i = netif->tx_req_cons; + if ( (i == netif->tx->req_prod) || + ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) + { + netif_put(netif); + continue; + } + memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, + sizeof(txreq)); + netif->tx_req_cons++; + +#if 0 + /* Credit-based scheduling. */ + if ( tx.size > netif->remaining_credit ) + { + s_time_t now = NOW(), next_credit = + netif->credit_timeout.expires + MICROSECS(netif->credit_usec); + if ( next_credit <= now ) + { + netif->credit_timeout.expires = now; + netif->remaining_credit = netif->credit_bytes; + } + else + { + netif->remaining_credit = 0; + netif->credit_timeout.expires = next_credit; + netif->credit_timeout.data = (unsigned long)netif; + netif->credit_timeout.function = tx_credit_callback; + netif->credit_timeout.cpu = smp_processor_id(); + add_ac_timer(&netif->credit_timeout); + break; + } + } + netif->remaining_credit -= tx.size; +#endif + + netif_schedule_work(netif); + + if ( unlikely(txreq.size <= PKT_PROT_LEN) || + unlikely(txreq.size > ETH_FRAME_LEN) ) + { + DPRINTK("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page boundary as the payload mustn't fragment. */ + if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) + { + DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", + txreq.addr, txreq.size, + (txreq.addr &~PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + if ( unlikely((skb = alloc_skb(PKT_PROT_LEN+16, GFP_ATOMIC)) == NULL) ) + { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + + /* Packets passed to netif_rx() must have some headroom. */ + skb_reserve(skb, 16); + + mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; + mcl[0].args[2] = 0; + mcl[0].args[3] = netif->domid; + mcl++; + + memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((u16 *)skb->data) = pending_idx; + + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + /* Filled the batch queue? */ + if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) ) + break; + } + + if ( mcl == tx_mcl ) + return; + + if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) + BUG(); + + mcl = tx_mcl; + while ( (skb = __skb_dequeue(&tx_queue)) != NULL ) + { + pending_idx = *((u16 *)skb->data); + netif = pending_tx_info[pending_idx].netif; + memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); + + /* Check the remap error code. */ + if ( unlikely(mcl[0].args[5] != 0) ) + { + DPRINTK("Bad page frame\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + kfree_skb(skb); + mcl++; + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + continue; + } + + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = + txreq.addr >> PAGE_SHIFT; + + __skb_put(skb, PKT_PROT_LEN); + memcpy(skb->data, + (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), + PKT_PROT_LEN); + + page = virt_to_page(MMAP_VADDR(pending_idx)); + + /* Append the packet payload as a fragment. */ + skb_shinfo(skb)->frags[0].page = page; + skb_shinfo(skb)->frags[0].size = txreq.size - PKT_PROT_LEN; + skb_shinfo(skb)->frags[0].page_offset = + (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK; + skb_shinfo(skb)->nr_frags = 1; + skb->data_len = txreq.size - PKT_PROT_LEN; + skb->len += skb->data_len; + + skb->dev = netif->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + /* + * Destructor information. We hideously abuse the 'mapping' pointer, + * which isn't otherwise used by us. The page deallocator is modified + * to interpret a non-NULL value as a destructor function to be called. + * This works okay because in all other cases the pointer must be NULL + * when the page is freed (normally Linux will explicitly bug out if + * it sees otherwise. + */ + page->mapping = (struct address_space *)netif_page_release; + set_page_count(page, 1); + + netif->stats.tx_bytes += txreq.size; + netif->stats.tx_packets++; + + netif_rx(skb); + netif->dev->last_rx = jiffies; + + mcl++; + } +} + +static void netif_page_release(struct page *page) +{ + unsigned long flags; + u16 pending_idx = page - virt_to_page(mmap_vstart); + + /* Stop the abuse. */ + page->mapping = NULL; + + spin_lock_irqsave(&dealloc_lock, flags); + dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; + spin_unlock_irqrestore(&dealloc_lock, flags); + + tasklet_schedule(&net_tx_tasklet); +} + +#if 0 +long flush_bufs_for_netif(netif_t *netif) +{ + NETIF_RING_IDX i; + + /* Return any outstanding receive buffers to the guest OS. */ + spin_lock(&netif->rx_lock); + for ( i = netif->rx_req_cons; + (i != netif->rx->req_prod) && + ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE); + i++ ) + { + make_rx_response(netif, + netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id, + NETIF_RSP_DROPPED, 0, 0); + } + netif->rx_req_cons = i; + spin_unlock(&netif->rx_lock); + + /* + * Flush pending transmit buffers. The guest may still have to wait for + * buffers that are queued at a physical NIC. + */ + spin_lock(&netif->tx_lock); + for ( i = netif->tx_req_cons; + (i != netif->tx->req_prod) && + ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE); + i++ ) + { + make_tx_response(netif, + netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id, + NETIF_RSP_DROPPED); + } + netif->tx_req_cons = i; + spin_unlock(&netif->tx_lock); + + return 0; +} +#endif + +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + netif_t *netif = dev_id; + if ( tx_work_exists(netif) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } + return IRQ_HANDLED; +} + +static void make_tx_response(netif_t *netif, + u16 id, + s8 st) +{ + NETIF_RING_IDX i = netif->tx_resp_prod; + netif_tx_response_t *resp; + + resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; + resp->id = id; + resp->status = st; + wmb(); + netif->tx->resp_prod = netif->tx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + if ( i == netif->tx->event ) + notify_via_evtchn(netif->evtchn); +} + +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size) +{ + NETIF_RING_IDX i = netif->rx_resp_prod; + netif_rx_response_t *resp; + + resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + resp->addr = addr; + resp->id = id; + resp->status = (s16)size; + if ( st < 0 ) + resp->status = (s16)st; + wmb(); + netif->rx->resp_prod = netif->rx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + return (i == netif->rx->event); +} + +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +{ + struct list_head *ent; + netif_t *netif; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irq(&net_schedule_list_lock); + + list_for_each ( ent, &net_schedule_list ) + { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", + i, netif->rx_req_cons, netif->rx_resp_prod); + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", + netif->tx_req_cons, netif->tx_resp_prod); + printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", + netif->rx->req_prod, netif->rx->resp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx->event, netif->tx->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx->resp_prod, netif->tx->event); + i++; + } + + spin_unlock_irq(&net_schedule_list_lock); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); + + return IRQ_HANDLED; +} + +static int __init netback_init(void) +{ + int i; + + if ( !(start_info.flags & SIF_NET_BE_DOMAIN) && + !(start_info.flags & SIF_INITDOMAIN) ) + return 0; + + printk("Initialising Xen netif backend\n"); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + netif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netif_ctrlif_init(); + + (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), + netif_be_dbg, SA_SHIRQ, + "net-be-dbg", &netif_be_dbg); + + return 0; +} + +static void netback_cleanup(void) +{ + BUG(); +} + +module_init(netback_init); +module_exit(netback_cleanup); diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig similarity index 100% rename from linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig rename to linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile new file mode 100644 index 0000000000..7eb07a2483 --- /dev/null +++ b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile @@ -0,0 +1,2 @@ + +obj-y := netfront.o diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/network.c b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c similarity index 100% rename from linux-2.6.7-xen-sparse/drivers/xen/net/network.c rename to linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c diff --git a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h index ff65b7eb41..49bf0975ec 100644 --- a/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h +++ b/linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h @@ -151,6 +151,12 @@ static inline int flush_page_update_queue(void) #define xen_flush_page_update_queue() (_flush_page_update_queue()) void MULTICALL_flush_page_update_queue(void); +#ifdef CONFIG_XEN_PHYSDEV_ACCESS +/* Allocate a contiguous empty region of low memory. Return virtual start. */ +unsigned long allocate_empty_lowmem_region(unsigned long pages); +/* Deallocate a contiguous region of low memory. Return it to the allocator. */ +void deallocate_lowmem_region(unsigned long vstart, unsigned long pages); +#endif /* * Assembler stubs for hyper-calls. @@ -389,9 +395,12 @@ static inline int HYPERVISOR_update_va_mapping( "b" (page_nr), "c" ((new_val).pte_low), "d" (flags) : "memory" ); if ( unlikely(ret < 0) ) - panic("Failed update VA mapping: %08lx, %08lx, %08lx", - page_nr, (new_val).pte_low, flags); - + { + printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n", + page_nr, (new_val).pte_low, flags); + BUG(); + } + return ret; } diff --git a/linux-2.6.7-xen-sparse/mm/page_alloc.c b/linux-2.6.7-xen-sparse/mm/page_alloc.c new file mode 100644 index 0000000000..5d9b765d39 --- /dev/null +++ b/linux-2.6.7-xen-sparse/mm/page_alloc.c @@ -0,0 +1,2017 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DECLARE_BITMAP(node_online_map, MAX_NUMNODES); +struct pglist_data *pgdat_list; +unsigned long totalram_pages; +unsigned long totalhigh_pages; +int nr_swap_pages; +int numnodes = 1; +int sysctl_lower_zone_protection = 0; + +EXPORT_SYMBOL(totalram_pages); +EXPORT_SYMBOL(nr_swap_pages); + +/* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags + */ +struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +int min_free_kbytes = 1024; + +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + return 1; + if (page_to_pfn(page) < zone->zone_start_pfn) + return 1; + if (zone != page_zone(page)) + return 1; + return 0; +} + +static void bad_page(const char *function, struct page *page) +{ + printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", + function, current->comm, page); + printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n", + (unsigned long)page->flags, page->mapping, + (int)page->mapcount, page_count(page)); + printk(KERN_EMERG "Backtrace:\n"); + dump_stack(); + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); + page->flags &= ~(1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_maplock | + 1 << PG_anon | + 1 << PG_swapcache | + 1 << PG_writeback); + set_page_count(page, 0); + page->mapping = NULL; + page->mapcount = 0; +} + +#ifndef CONFIG_HUGETLB_PAGE +#define prep_compound_page(page, order) do { } while (0) +#define destroy_compound_page(page, order) do { } while (0) +#else +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All pages have their ->private pointing at + * the head page (even the head page has this). + * + * The first tail page's ->mapping, if non-zero, holds the address of the + * compound page's put_page() function. + * + * The order of the allocation is stored in the first tail page's ->index + * This is only for debug at present. This usage means that zero-order pages + * may not be compound. + */ +static void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + page[1].mapping = 0; + page[1].index = order; + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + SetPageCompound(p); + p->private = (unsigned long)page; + } +} + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (!PageCompound(page)) + return; + + if (page[1].index != order) + bad_page(__FUNCTION__, page); + + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + if (!PageCompound(p)) + bad_page(__FUNCTION__, page); + if (p->private != (unsigned long)page) + bad_page(__FUNCTION__, page); + ClearPageCompound(p); + } +} +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep one bit for each pair of blocks, which + * is set to 1 iff only one of the pair is allocated. So when we + * are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static inline void __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + unsigned long page_idx, index; + + if (order) + destroy_compound_page(page, order); + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); + + zone->free_pages -= mask; + while (mask + (1 << (MAX_ORDER-1))) { + struct page *buddy1, *buddy2; + + BUG_ON(area >= zone->free_area + MAX_ORDER); + if (!__test_and_change_bit(index, area->map)) + /* + * the buddy page is still allocated. + */ + break; + /* + * Move the buddy up one level. + * This code is taking advantage of the identity: + * -mask = 1+~mask + */ + buddy1 = base + (page_idx ^ -mask); + buddy2 = base + page_idx; + BUG_ON(bad_range(zone, buddy1)); + BUG_ON(bad_range(zone, buddy2)); + list_del(&buddy1->lru); + mask <<= 1; + area++; + index >>= 1; + page_idx &= mask; + } + list_add(&(base + page_idx)->lru, &area->free_list); +} + +static inline void free_pages_check(const char *function, struct page *page) +{ + if ( page_mapped(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_maplock | + 1 << PG_anon | + 1 << PG_swapcache | + 1 << PG_writeback ))) + bad_page(function, page); + if (PageDirty(page)) + ClearPageDirty(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free, or 0 for all on the list. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. + */ +static int +free_pages_bulk(struct zone *zone, int count, + struct list_head *list, unsigned int order) +{ + unsigned long mask, flags; + struct free_area *area; + struct page *base, *page = NULL; + int ret = 0; + + mask = (~0UL) << order; + base = zone->zone_mem_map; + area = zone->free_area + order; + spin_lock_irqsave(&zone->lock, flags); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (!list_empty(list) && count--) { + page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_pages_bulk list manipulates */ + list_del(&page->lru); + __free_pages_bulk(page, base, zone, area, mask, order); + ret++; + } + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} + +void __free_pages_ok(struct page *page, unsigned int order) +{ + LIST_HEAD(list); + int i; + + mod_page_state(pgfree, 1 << order); + for (i = 0 ; i < (1 << order) ; ++i) + free_pages_check(__FUNCTION__, page + i); + list_add(&page->lru, &list); + kernel_map_pages(page, 1<> (1+(order)), (area)->map) + +static inline struct page * +expand(struct zone *zone, struct page *page, + unsigned long index, int low, int high, struct free_area *area) +{ + unsigned long size = 1 << high; + + while (high > low) { + BUG_ON(bad_range(zone, page)); + area--; + high--; + size >>= 1; + list_add(&page->lru, &area->free_list); + MARK_USED(index, high, area); + index += size; + page += size; + } + return page; +} + +static inline void set_page_refs(struct page *page, int order) +{ +#ifdef CONFIG_MMU + set_page_count(page, 1); +#else + int i; + + /* + * We need to reference all the pages for this order, otherwise if + * anyone accesses one of the pages with (get/put) it will be freed. + */ + for (i = 0; i < (1 << order); i++) + set_page_count(page+i, 1); +#endif /* CONFIG_MMU */ +} + +/* + * This page is about to be returned from the page allocator + */ +static void prep_new_page(struct page *page, int order) +{ + if (page->mapping || page_mapped(page) || + (page->flags & ( + 1 << PG_private | + 1 << PG_locked | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_maplock | + 1 << PG_anon | + 1 << PG_swapcache | + 1 << PG_writeback ))) + bad_page(__FUNCTION__, page); + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk); + page->private = 0; + set_page_refs(page, order); +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. + */ +static struct page *__rmqueue(struct zone *zone, unsigned int order) +{ + struct free_area * area; + unsigned int current_order; + struct page *page; + unsigned int index; + + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = zone->free_area + current_order; + if (list_empty(&area->free_list)) + continue; + + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); + index = page - zone->zone_mem_map; + if (current_order != MAX_ORDER-1) + MARK_USED(index, current_order, area); + zone->free_pages -= 1UL << order; + return expand(zone, page, index, order, current_order, area); + } + + return NULL; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list) +{ + unsigned long flags; + int i; + int allocated = 0; + struct page *page; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { + page = __rmqueue(zone, order); + if (page == NULL) + break; + allocated++; + list_add_tail(&page->lru, list); + } + spin_unlock_irqrestore(&zone->lock, flags); + return allocated; +} + +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) +static void __drain_pages(unsigned int cpu) +{ + struct zone *zone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + pset = &zone->pageset[cpu]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } +} +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_PM +int is_head_of_free_region(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + int order; + struct list_head *curr; + + /* + * Should not matter as we need quiescent system for + * suspend anyway, but... + */ + spin_lock_irqsave(&zone->lock, flags); + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) + if (page == list_entry(curr, struct page, lru)) { + spin_unlock_irqrestore(&zone->lock, flags); + return 1 << order; + } + spin_unlock_irqrestore(&zone->lock, flags); + return 0; +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); +} +#endif /* CONFIG_PM */ + +static void zone_statistics(struct zonelist *zonelist, struct zone *z) +{ +#ifdef CONFIG_NUMA + unsigned long flags; + int cpu; + pg_data_t *pg = z->zone_pgdat; + pg_data_t *orig = zonelist->zones[0]->zone_pgdat; + struct per_cpu_pageset *p; + + local_irq_save(flags); + cpu = smp_processor_id(); + p = &z->pageset[cpu]; + if (pg == orig) { + z->pageset[cpu].numa_hit++; + } else { + p->numa_miss++; + zonelist->zones[0]->pageset[cpu].numa_foreign++; + } + if (pg == NODE_DATA(numa_node_id())) + p->local_node++; + else + p->other_node++; + local_irq_restore(flags); +#endif +} + +/* + * Free a 0-order page + */ +static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); +static void fastcall free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + /* XXX Xen: use mapping pointer as skb/data-page destructor */ + if (page->mapping) + return (*(void(*)(struct page *))page->mapping)(page); + + kernel_map_pages(page, 1, 0); + inc_page_state(pgfree); + free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + list_add(&page->lru, &pcp->list); + pcp->count++; + local_irq_restore(flags); + put_cpu(); +} + +void fastcall free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void fastcall free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +/* + * Really, prep_compound_page() should be called from __rmqueue_bulk(). But + * we cheat by calling it from here, in the order > 0 path. Saves a branch + * or two. + */ + +static struct page * +buffered_rmqueue(struct zone *zone, int order, int gfp_flags) +{ + unsigned long flags; + struct page *page = NULL; + int cold = !!(gfp_flags & __GFP_COLD); + + if (order == 0) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count <= pcp->low) + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; + } + local_irq_restore(flags); + put_cpu(); + } + + if (page == NULL) { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock_irqrestore(&zone->lock, flags); + } + + if (page != NULL) { + BUG_ON(bad_range(zone, page)); + mod_page_state_zone(zone, pgalloc, 1 << order); + prep_new_page(page, order); + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + } + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. + * + * Herein lies the mysterious "incremental min". That's the + * + * local_low = z->pages_low; + * min += local_low; + * + * thing. The intent here is to provide additional protection to low zones for + * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM + * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL + * request. This preserves additional space in those lower zones for requests + * which really do need memory from those zones. It means that on a decent + * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA + * zone untouched. + */ +struct page * fastcall +__alloc_pages(unsigned int gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const int wait = gfp_mask & __GFP_WAIT; + unsigned long min; + struct zone **zones; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int i; + int alloc_type; + int do_retry; + + might_sleep_if(wait); + + zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ + if (zones[0] == NULL) /* no zones in the zonelist */ + return NULL; + + alloc_type = zone_idx(zones[0]); + + /* Go through the zonelist once, looking for a zone with enough free */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min = (1<protection[alloc_type]; + + /* + * We let real-time tasks dip their real-time paws a little + * deeper into reserves. + */ + if (rt_task(p)) + min -= z->pages_low >> 1; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + } + + /* we're somewhat low on memory, failed to find what we needed */ + for (i = 0; zones[i] != NULL; i++) + wakeup_kswapd(zones[i]); + + /* Go through the zonelist again, taking __GFP_HIGH into account */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min = (1<protection[alloc_type]; + + if (gfp_mask & __GFP_HIGH) + min -= z->pages_low >> 2; + if (rt_task(p)) + min -= z->pages_low >> 1; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + } + + /* here we're in the low on memory slow path */ + +rebalance: + if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) { + /* go through the zonelist yet again, ignoring mins */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + try_to_free_pages(zones, gfp_mask, order); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + /* go through the zonelist yet one more time */ + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + min = (1UL << order) + z->protection[alloc_type]; + + if (z->free_pages >= min || + (!wait && z->free_pages >= z->pages_high)) { + page = buffered_rmqueue(z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, z); + goto got_pg; + } + } + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that + * may not be true in other implementations. + */ + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + } + return NULL; +got_pg: + kernel_map_pages(page, 1 << order, 1); + return page; +} + +EXPORT_SYMBOL(__alloc_pages); + +#ifdef CONFIG_NUMA +/* Early boot: Everything is done by one cpu, but the data structures will be + * used by all cpus - spread them on all nodes. + */ +static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order) +{ +static int nodenr; + int i = nodenr; + struct page *page; + + for (;;) { + if (i > nodenr + numnodes) + return 0; + if (node_present_pages(i%numnodes)) { + struct zone **z; + /* The node contains memory. Check that there is + * memory in the intended zonelist. + */ + z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones; + while (*z) { + if ( (*z)->free_pages > (1UL<= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (!PageReserved(page) && put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } +} + +EXPORT_SYMBOL(__free_pages); + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + BUG_ON(!virt_addr_valid(addr)); + __free_pages(virt_to_page(addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages(void) +{ + unsigned int sum = 0; + struct zone *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +EXPORT_SYMBOL(nr_free_pages); + +unsigned int nr_used_zone_pages(void) +{ + unsigned int pages = 0; + struct zone *zone; + + for_each_zone(zone) + pages += zone->nr_active + zone->nr_inactive; + + return pages; +} + +#ifdef CONFIG_NUMA +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) +{ + unsigned int i, sum = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += pgdat->node_zones[i].free_pages; + + return sum; +} +#endif + +static unsigned int nr_free_zone_pages(int offset) +{ + pg_data_t *pgdat; + unsigned int sum = 0; + + for_each_pgdat(pgdat) { + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone *zone; + + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); +} + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); +} + +#ifdef CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} +#endif + +#ifdef CONFIG_NUMA +static void show_node(struct zone *zone) +{ + printk("Node %d ", zone->zone_pgdat->node_id); +} +#else +#define show_node(zone) do { } while (0) +#endif + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. + */ +DEFINE_PER_CPU(struct page_state, page_states) = {0}; +EXPORT_PER_CPU_SYMBOL(page_states); + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +void __get_page_state(struct page_state *ret, int nr) +{ + int cpu = 0; + + memset(ret, 0, sizeof(*ret)); + while (cpu < NR_CPUS) { + unsigned long *in, *out, off; + + if (!cpu_possible(cpu)) { + cpu++; + continue; + } + + in = (unsigned long *)&per_cpu(page_states, cpu); + cpu++; + if (cpu < NR_CPUS && cpu_possible(cpu)) + prefetch(&per_cpu(page_states, cpu)); + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state(struct page_state *ret) +{ + int nr; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1); +} + +void get_full_page_state(struct page_state *ret) +{ + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); +} + +unsigned long __read_page_state(unsigned offset) +{ + unsigned long ret = 0; + int cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + unsigned long in; + + if (!cpu_possible(cpu)) + continue; + + in = (unsigned long)&per_cpu(page_states, cpu) + offset; + ret += *((unsigned long *)in); + } + return ret; +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct zone *zone; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_zone(zone) { + *active += zone->nr_active; + *inactive += zone->nr_inactive; + *free += zone->free_pages; + } +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = nr_free_pages(); + val->bufferram = nr_blockdev_pages(); +#ifdef CONFIG_HIGHMEM + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = nr_free_pages_pgdat(pgdat); + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + struct page_state ps; + int cpu, temperature; + unsigned long active; + unsigned long inactive; + unsigned long free; + struct zone *zone; + + for_each_zone(zone) { + show_node(zone); + printk("%s per-cpu:", zone->name); + + if (!zone->present_pages) { + printk(" empty\n"); + continue; + } else + printk("\n"); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + struct per_cpu_pageset *pageset; + + if (!cpu_possible(cpu)) + continue; + + pageset = zone->pageset + cpu; + + for (temperature = 0; temperature < 2; temperature++) + printk("cpu %d %s: low %d, high %d, batch %d\n", + cpu, + temperature ? "cold" : "hot", + pageset->pcp[temperature].low, + pageset->pcp[temperature].high, + pageset->pcp[temperature].batch); + } + } + + get_page_state(&ps); + get_zone_counts(&active, &inactive, &free); + + printk("\nFree pages: %11ukB (%ukB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " + "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + active, + inactive, + ps.nr_dirty, + ps.nr_writeback, + ps.nr_unstable, + nr_free_pages(), + ps.nr_slab, + ps.nr_mapped, + ps.nr_page_table_pages); + + for_each_zone(zone) { + int i; + + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" + "\n", + zone->name, + K(zone->free_pages), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone->nr_active), + K(zone->nr_inactive), + K(zone->present_pages) + ); + printk("protections[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->protection[i]); + printk("\n"); + } + + for_each_zone(zone) { + struct list_head *elem; + unsigned long nr, flags, order, total = 0; + + show_node(zone); + printk("%s: ", zone->name); + if (!zone->present_pages) { + printk("empty\n"); + continue; + } + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr = 0; + list_for_each(elem, &zone->free_area[order].free_list) + ++nr; + total += nr << order; + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + printk("= %lukB\n", K(total)); + } + + show_swap_cache_info(); +} + +/* + * Builds allocation fallback zone lists. + */ +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +{ + switch (k) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->present_pages) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->present_pages) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->present_pages) + zonelist->zones[j++] = zone; + } + + return j; +} + +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (numnodes) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given + * node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: pointer to the bitmap of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. + */ +static int __init find_next_best_node(int node, void *used_node_mask) +{ + int i, n, val; + int min_val = INT_MAX; + int best_node = -1; + + for (i = 0; i < numnodes; i++) { + cpumask_t tmp; + + /* Start from local node */ + n = (node+i)%numnodes; + + /* Don't want a node to appear more than once */ + if (test_bit(n, used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Give preference to headless and unused nodes */ + tmp = node_to_cpumask(n); + if (!cpus_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + set_bit(best_node, used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + DECLARE_BITMAP(used_mask, MAX_NUMNODES); + + /* initialize zonelists */ + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = numnodes; + prev_node = local_node; + bitmap_zero(used_mask, MAX_NUMNODES); + while ((node = find_next_best_node(local_node, used_mask)) >= 0) { + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zonelist *zonelist; + + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + + j = 0; + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + + zonelist->zones[j] = NULL; + } +} + +#endif /* CONFIG_NUMA */ + +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); + printk("Built %i zonelists\n", numnodes); +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static void __init calculate_zone_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __init memmap_init_zone(struct page *start, unsigned long size, int nid, + unsigned long zone, unsigned long start_pfn) +{ + struct page *page; + + for (page = start; page < (start + size); page++) { + set_page_zone(page, NODEZONE(nid, zone)); + set_page_count(page, 0); + SetPageReserved(page); + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (zone != ZONE_HIGHMEM) + set_page_address(page, __va(start_pfn << PAGE_SHIFT)); +#endif + start_pfn++; + } +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(start, size, nid, zone, start_pfn) \ + memmap_init_zone((start), (size), (nid), (zone), (start_pfn)) +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __init free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long i, j; + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + int cpu, nid = pgdat->node_id; + struct page *lmem_map = pgdat->node_mem_map; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize; + unsigned long batch; + + zone_table[NODEZONE(nid, j)] = zone; + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + zone->spanned_pages = size; + zone->present_pages = realsize; + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + + zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + printk(" %s zone: %lu pages, LIFO batch:%lu\n", + zone_names[j], realsize, batch); + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); + atomic_set(&zone->nr_scan_active, 0); + atomic_set(&zone->nr_scan_inactive, 0); + zone->nr_active = 0; + zone->nr_inactive = 0; + if (!size) + continue; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + zone->zone_mem_map = lmem_map; + zone->zone_start_pfn = zone_start_pfn; + + if ((zone_start_pfn) & (zone_required_alignment-1)) + printk("BUG: wrong zone alignment, it will crash\n"); + + memmap_init(lmem_map, size, nid, j, zone_start_pfn); + + zone_start_pfn += size; + lmem_map += size; + + for (i = 0; ; i++) { + unsigned long bitmap_size; + + INIT_LIST_HEAD(&zone->free_area[i].free_list); + if (i == MAX_ORDER-1) { + zone->free_area[i].map = NULL; + break; + } + + /* + * Page buddy system uses "index >> (i+1)", + * where "index" is at most "size-1". + * + * The extra "+3" is to round down to byte + * size (8 bits per byte assumption). Thus + * we get "(size-1) >> (i+4)" as the last byte + * we can access. + * + * The "+1" is because we want to round the + * byte allocation up rather than down. So + * we should have had a "+7" before we shifted + * down by three. Also, we have to add one as + * we actually _use_ the last bit (it's [0,n] + * inclusive, not [0,n[). + * + * So we actually had +7+1 before we shift + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 + * (modulo overflows, which we do not have). + * + * Finally, we LONG_ALIGN because all bitmap + * operations are on longs. + */ + bitmap_size = (size-1) >> (i+4); + bitmap_size = LONG_ALIGN(bitmap_size+1); + zone->free_area[i].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } + } +} + +void __init free_area_init_node(int nid, struct pglist_data *pgdat, + struct page *node_mem_map, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) +{ + unsigned long size; + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_zone_totalpages(pgdat, zones_size, zholes_size); + if (!node_mem_map) { + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + node_mem_map = alloc_bootmem_node(pgdat, size); + } + pgdat->node_mem_map = node_mem_map; + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifndef CONFIG_DISCONTIGMEM +static bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +EXPORT_SYMBOL(contig_page_data); + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, &contig_page_data, NULL, zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + mem_map = contig_page_data.node_mem_map; +} +#endif + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return pgdat->pgdat_next; +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the freelist for each zone. Whilst this is slow, I'd rather + * be slow here than slow down the fast path by keeping stats - mjbligh + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!zone->present_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + unsigned long nr_bufs = 0; + struct list_head *elem; + + list_for_each(elem, &(zone->free_area[order].free_list)) + ++nr_bufs; + seq_printf(m, "%6lu ", nr_bufs); + } + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + "pgalloc_high", + + "pgalloc_normal", + "pgalloc_dma", + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma", + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + + "pgscan_kswapd_dma", + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma", + "pginodesteal", + + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_HOTPLUG_CPU +static int page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + long *count; + + if (action == CPU_DEAD) { + /* Drain local pagecache count. */ + count = &per_cpu(nr_pagecache_local, cpu); + atomic_add(*count, &nr_pagecache); + *count = 0; + local_irq_disable(); + __drain_pages(cpu); + local_irq_enable(); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init page_alloc_init(void) +{ + hotcpu_notifier(page_alloc_cpu_notify, 0); +} + +static unsigned long higherzone_val(struct zone *z, int max_zone, + int alloc_type) +{ + int z_idx = zone_idx(z); + struct zone *higherzone; + unsigned long pages; + + /* there is no higher zone to get a contribution from */ + if (z_idx == MAX_NR_ZONES-1) + return 0; + + higherzone = &z->zone_pgdat->node_zones[z_idx+1]; + + /* We always start with the higher zone's protection value */ + pages = higherzone->protection[alloc_type]; + + /* + * We get a lower-zone-protection contribution only if there are + * pages in the higher zone and if we're not the highest zone + * in the current zonelist. e.g., never happens for GFP_DMA. Happens + * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA + * and ZONE_NORMAL for a GFP_HIGHMEM allocation. + */ + if (higherzone->present_pages && z_idx < alloc_type) + pages += higherzone->pages_low * sysctl_lower_zone_protection; + + return pages; +} + +/* + * setup_per_zone_protection - called whenver min_free_kbytes or + * sysctl_lower_zone_protection changes. Ensures that each zone + * has a correct pages_protected value, so an adequate number of + * pages are left in the zone after a successful __alloc_pages(). + * + * This algorithm is way confusing. I tries to keep the same behavior + * as we had with the incremental min iterative algorithm. + */ +static void setup_per_zone_protection(void) +{ + struct pglist_data *pgdat; + struct zone *zones, *zone; + int max_zone; + int i, j; + + for_each_pgdat(pgdat) { + zones = pgdat->node_zones; + + for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++) + if (zones[i].present_pages) + max_zone = i; + + /* + * For each of the different allocation types: + * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + /* + * For each of the zones: + * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA + */ + for (j = MAX_NR_ZONES-1; j >= 0; j--) { + zone = &zones[j]; + + /* + * We never protect zones that don't have memory + * in them (j>max_zone) or zones that aren't in + * the zonelists for a certain type of + * allocation (j>i). We have to assign these to + * zero because the lower zones take + * contributions from the higher zones. + */ + if (j > max_zone || j > i) { + zone->protection[i] = 0; + continue; + } + /* + * The contribution of the next higher zone + */ + zone->protection[i] = higherzone_val(zone, + max_zone, i); + zone->protection[i] += zone->pages_low; + } + } + } +} + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. + */ +static void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + } + + for_each_zone(zone) { + spin_lock_irqsave(&zone->lru_lock, flags); + if (is_highmem(zone)) { + /* + * Often, highmem doesn't need to reserve any pages. + * But the pages_min/low/high values are also used for + * batching up page reclaim activity so we need a + * decent value here. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* if it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. + */ + zone->pages_min = (pages_min * zone->present_pages) / + lowmem_pages; + } + + zone->pages_low = zone->pages_min * 2; + zone->pages_high = zone->pages_min * 3; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (16MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = sqrt(lowmem_kbytes) + * + * which yields + * + * 16MB: 128k + * 32MB: 181k + * 64MB: 256k + * 128MB: 362k + * 256MB: 512k + * 512MB: 724k + * 1024MB: 1024k + * 2048MB: 1448k + * 4096MB: 2048k + * 8192MB: 2896k + * 16384MB: 4096k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 16384) + min_free_kbytes = 16384; + setup_per_zone_pages_min(); + setup_per_zone_protection(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. + */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length) +{ + proc_dointvec(table, write, file, buffer, length); + setup_per_zone_pages_min(); + setup_per_zone_protection(); + return 0; +} + +/* + * lower_zone_protection_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_protection() + * whenever sysctl_lower_zone_protection changes. + */ +int lower_zone_protection_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length) +{ + proc_dointvec_minmax(table, write, file, buffer, length); + setup_per_zone_protection(); + return 0; +} -- 2.30.2